#PCA - determine the most useful variables 

# Load necessary libraries
library(ggbiplot)  # Install with devtools::install_github("vqv/ggbiplot")
library(dplyr)
library(caret)
library(ggplot2)
library(plotly)
# Load the dataset
data(iris)
iris_df <- as.data.frame(iris)

# Remove duplicate rows
iris_df <- distinct(iris_df)

# Extract only numerical variables for PCA
sampled_var <- iris_df %>% select(-Species)
# Perform PCA (center and scale the data)
pca_results <- prcomp(sampled_var, center = TRUE, scale. = TRUE)  

# Summary of PCA (explains variance captured by each principal component)
summary(pca_results)
Importance of components:
                          PC1    PC2     PC3    PC4
Standard deviation     1.7087 0.9562 0.38111 0.1442
Proportion of Variance 0.7299 0.2286 0.03631 0.0052
Cumulative Proportion  0.7299 0.9585 0.99480 1.0000
# Scree plot - Proportion of Variance Explained (PVE)
pve <- (pca_results$sdev^2) / sum(pca_results$sdev^2)

plot(pve, xlab = "Principal Component", 
     ylab = "Proportion of Variance Explained (PVE)",
     main = "Scree Plot",
     pch = 20, cex = 2, col = "blue", type = "b")

# Cumulative Variance Explained
cpv <- cumsum(pve)

plot(cpv, xlab = "Principal Component", 
     ylab = "Cumulative Proportion of Variance Explained",
     main = "Cumulative Variance Explained",
     pch = 20, cex = 2, col = "blue", type = "b")

# Extract loading matrix (contribution of original variables to PCs)
pca_loadings <- pca_results$rotation
print(pca_loadings)  # Check which variables contribute most to each PC
                    PC1         PC2        PC3        PC4
Sepal.Length  0.5220620 -0.37334080  0.7203991  0.2628555
Sepal.Width  -0.2676921 -0.92485914 -0.2402154 -0.1235842
Petal.Length  0.5803119 -0.02431115 -0.1405854 -0.8017998
Petal.Width   0.5648277 -0.06827292 -0.6352617  0.5222557
PC1 (First Principal Component)

The first PC captures the most variance in the dataset.

Interpretation: PC1 is mainly influenced by petal size (both length & width), meaning it likely captures the overall size of the flower.

# Extract PCA scores (newly projected data in PC space)
pca_scores <- pca_results$x

# Visualizing PCA - Biplot
ggbiplot(pca_results, choices = c(1,2), obs.scale = 1, var.scale = 1,
         groups = iris_df$Species, ellipse = TRUE, var.axes = TRUE) + 
  theme_minimal() + 
  ggtitle("PCA Biplot of the Iris Dataset") +
  scale_color_manual(values = c("setosa" = "red", "versicolor" = "blue", "virginica" = "green"))

#3D PCA

# Extract PCA scores (newly projected data in PC space)
pca_scores <- as.data.frame(pca_results$x)

# Add species labels for coloring
pca_scores$Species <- iris_df$Species

# Define colors for species
my_colors <- c("setosa" = "red", "versicolor" = "blue", "virginica" = "green")

# Create an interactive 3D PCA plot
plot_ly(pca_scores, 
        x = ~PC1, 
        y = ~PC2, 
        z = ~PC3, 
        color = ~Species, 
        colors = my_colors, 
        type = "scatter3d", 
        mode = "markers") %>%
  layout(title = "3D PCA Visualization of Iris Dataset",
         scene = list(xaxis = list(title = "PC1"),
                      yaxis = list(title = "PC2"),
                      zaxis = list(title = "PC3")))